import csv
import re
from urllib.parse import urljoin

import matplotlib
import numpy
import pandas
import requests
url="https://casad.cas.cn/ysxx2022/ysmd/qtys/"  #要爬取网站的url
dic={
    "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Mobile Safari/537.36 Edg/126.0.0.0"
}#伪装ua，防止反爬
resp = requests.get(url,headers=dic)
resp.encoding = "utf-8"  #解决乱码格式
obj1 = re.compile(r'<div class="rmbs_a">(?P<ul>.*?)</div>', re.S)#爬取的是超链接页面
obj2= re.compile(r'<a href="(?P<href>.*?)"')#爬取具体的链接
obj3= re.compile(r'<script>.*?var currentChannel1 = "(?P<bumen>.*?)"'
                 r'.*?<p class="wztitle">(?P<name>.*?)</p>'
                 r'.*?<div class="acadImg"><img src="(?P<imgul>)"'
                 r'<div class="acadTxt">.*>(?P<test>.*?)家'
                 r'.*?从事(?P<work>.*?)。'
                 r'(?P<born>.*?)生于(?P<local>.*?)。.*?。(?P<year>.*?当选).*?'
                 r'</p></div></p>',re.S)
String = resp.text
# print(resp.text)
result1 = obj1.finditer(String) #爬取的是院士详情信息子页面的超链接
# Write one CSV row per link-list block and collect every detail-page href.
# newline="" stops the csv module from emitting blank lines on Windows;
# `with` guarantees the file is closed even if an iteration raises.
child_href_list = []  # detail-page hyperlinks, consumed by the loop below
with open("data01.csv", mode="w", encoding="utf-8", newline="") as f:
    csvwriter = csv.writer(f)
    for it in result1:
        # NOTE(review): the original bound this to `dic`, silently clobbering
        # the request-headers dict defined above — renamed to `row`.
        row = it.groupdict()
        ul = it.group("ul")  # inner HTML of the link-list block
        for itt in obj2.finditer(ul):
            print(itt.group("href"))  # progress / sanity check
            child_href_list.append(itt.group("href"))
        print("over")
        csvwriter.writerow(row.values())
print("over flag")



# Visit each detail page and extract the academician fields with obj3.
# Headers are rebuilt here because the original script clobbered `dic`
# inside the first loop, so it can no longer be trusted at this point.
ua_headers = {
    "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Mobile Safari/537.36 Edg/126.0.0.0"
}
a = 0  # count of detail pages visited (printed at the end)
for href in child_href_list:
    a += 1
    # Resolve relative hrefs (e.g. "./t2022....html") against the listing URL;
    # an already-absolute href passes through urljoin unchanged.
    child_url = urljoin(url, href)
    # Keep the spoofed UA on the detail requests too (the original dropped it).
    child_resp = requests.get(child_url, headers=ua_headers)
    child_resp.encoding = "utf-8"
    print("NO.", a)
    print(child_resp.text)
    resultT = obj3.search(child_resp.text)
    child_resp.close()  # release the connection before the next request
    if resultT is None:
        continue  # page layout did not match the pattern; skip it
    print(resultT.group("test"))
print(a)
# with open("child_href_test.html",mode="w",encoding="utf-8") as f:
#     f.write(child_resp.text)
